{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "https://nlpforhackers.io/complete-guide-to-spacy/\n",
    "\n",
    "https://spacy.io/usage/linguistic-features#example3\n",
    "\n",
    "https://spacy.io/usage/examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/usr/bin/python: No module named pathlib; 'spacy' is a package and cannot be directly executed\n"
     ]
    }
   ],
   "source": [
    "!pip install -U spacy --user"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Make sure to run this first:\n",
    "    \n",
    "    python -m spacy download en"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package vader_lexicon to\n",
      "[nltk_data]     /Users/itpstudent/nltk_data...\n",
      "[nltk_data]   Package vader_lexicon is already up-to-date!\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import nltk\n",
    "nltk.download('vader_lexicon')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'neg': 0.0, 'neu': 0.715, 'pos': 0.285, 'compound': 0.6115}\n"
     ]
    }
   ],
   "source": [
    "from __future__ import unicode_literals\n",
    "import spacy\n",
    "from spacy.tokens import Doc\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "\n",
    "\n",
    "sentiment_analyzer = SentimentIntensityAnalyzer()\n",
    "def polarity_scores(doc):\n",
    "    return sentiment_analyzer.polarity_scores(doc.text)\n",
    " \n",
    "#Doc.set_extension('polarity_scores', getter=polarity_scores)\n",
    " \n",
    "nlp = spacy.load('en')\n",
    "doc = nlp(\"I'm so happy that finally all of us are together again\")\n",
    "print(doc._.polarity_scores)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(u'CARDINAL', u'2')\n",
      "(u'TIME', u'9 a.m.')\n",
      "(u'PERCENT', u'30%')\n",
      "(u'DATE', u'just 2 days')\n",
      "(u'ORG', u'WSJ')\n"
     ]
    }
   ],
   "source": [
    "\n",
    "doc = nlp(\"I just bought 2 shares at 9 a.m. because the stock went up 30% in just 2 days according to the WSJ\")\n",
    "\n",
    "for ent in doc.ents:\n",
    "    print(ent.label_, ent.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(u'Next', u'JJ'), (u'week', u'NN'), (u'I', u'PRP'), (u\"'ll\", u'MD'), (u'be', u'VB'), (u'in', u'IN'), (u'Madrid', u'NNP'), (u'.', u'.')]\n"
     ]
    }
   ],
   "source": [
    "doc = nlp(\"Next week I'll be in Madrid.\")\n",
    "print([(token.text, token.tag_) for token in doc])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.792721354574\n",
      "0.798225896087\n",
      "0.801580147422\n",
      "0.528548488716\n"
     ]
    }
   ],
   "source": [
    "target = nlp(\"Cats are beautiful animals.\")\n",
    "doc1 = nlp(\"Dogs are awesome.\")\n",
    "doc2 = nlp(\"Some gorgeous creatures are felines.\")\n",
    "doc3 = nlp(\"Dolphins are swimming mammals.\")\n",
    "doc4 = nlp(\"I'm not happy with molecular physics and other such business\")\n",
    "print(target.similarity(doc1))  # 0.8901765218466683\n",
    "print(target.similarity(doc2))  # 0.9115828449161616\n",
    "print(target.similarity(doc3))  # 0.7822956752876101\n",
    "print(target.similarity(doc4))  # 0.7822956752876101"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(u'HAPPY', u'\\U0001f600')\n",
      "(u'HASHTAG', u'#MondayMotivation')\n"
     ]
    }
   ],
   "source": [
    "from spacy.lang.en import English\n",
    "from spacy.matcher import Matcher\n",
    "\n",
    "nlp = English()  # we only want the tokenizer, so no need to load a model\n",
    "matcher = Matcher(nlp.vocab)\n",
    "\n",
    "pos_emoji = [u'😀', u'😃', u'😂', u'🤣', u'😊', u'😍']  # positive emoji\n",
    "neg_emoji = [u'😞', u'😠', u'😩', u'😢', u'😭', u'😒']  # negative emoji\n",
    "\n",
    "# add patterns to match one or more emoji tokens\n",
    "pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emoji]\n",
    "neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emoji]\n",
    "\n",
    "# function to label the sentiment\n",
    "def label_sentiment(matcher, doc, i, matches):\n",
    "    match_id, start, end = matches[i]\n",
    "    if doc.vocab.strings[match_id] == 'HAPPY':  # don't forget to get string!\n",
    "        doc.sentiment += 0.1  # add 0.1 for positive sentiment\n",
    "    elif doc.vocab.strings[match_id] == 'SAD':\n",
    "        doc.sentiment -= 0.1  # subtract 0.1 for negative sentiment\n",
    "\n",
    "matcher.add('HAPPY', label_sentiment, *pos_patterns)  # add positive pattern\n",
    "matcher.add('SAD', label_sentiment, *neg_patterns)  # add negative pattern\n",
    "\n",
    "# add pattern for valid hashtag, i.e. '#' plus any ASCII token\n",
    "matcher.add('HASHTAG', None, [{'ORTH': '#'}, {'IS_ASCII': True}])\n",
    "\n",
    "doc = nlp(u\"Hello world 😀 #MondayMotivation\")\n",
    "matches = matcher(doc)\n",
    "for match_id, start, end in matches:\n",
    "    string_id = doc.vocab.strings[match_id]  # look up string ID\n",
    "    span = doc[start:end]\n",
    "    print(string_id, span.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
